Main simulation

generate_cases = function(effect, freq, N, bias = 0, noise_xs = 0, contam=5){
  
  X_min = -5
  X_max = -X_min
  if(contam > X_max | contam < X_min) stop('contam has to be between X_max and X_min')
  
  # latent variable X
  xs = sort(runif(N, min = X_min, max = X_max)) + (X_max-contam)
  # transform to get normally distributed RV Y - plus noise in ranking
  ys = qnorm(rank(xs + rnorm(N, mean = 0, sd = noise_xs))/(N+1))
  p = inv.logit(xs) - bias

  effect_xs = effect * p
  freqs = freq * (1-effect_xs)
  g = rbinom(n = N, size = 1, prob = freqs)
  
  return(list(g, ys, freqs, xs))
}
# example
N = 2000
out = generate_cases(effect = 0.9, freq = 0.06, N = N, bias = 0, noise_xs = 0, contam = 4)
summary(glm(out[[1]] ~ out[[2]], family='binomial'))
## 
## Call:
## glm(formula = out[[1]] ~ out[[2]], family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4914  -0.2265  -0.1844  -0.1500   3.3242  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.0384     0.1827 -22.103  < 2e-16 ***
## out[[2]]     -0.6033     0.1613  -3.741 0.000183 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 399.91  on 1999  degrees of freedom
## Residual deviance: 385.65  on 1998  degrees of freedom
## AIC: 389.65
## 
## Number of Fisher Scoring iterations: 7
mv_avg = Np_fit(x = out[[1]], y = out[[2]], BBs = 100, lambda = 1)
plot((1:N)/N, colMeans(mv_avg), type='l')

plot(out[[4]], out[[3]], type='l')

Simulations

Sim parameters

Ns = round(10^(seq(2,4,length.out = 20)))
effect = .5
freq = .1
BBs = 10000
RUN_SIMS = F

Bias means that the allele frequency is in fact increased in individuals who are contaminants (not correct phenotype). Noise is added to the latent variable xs to get the ys

no noise & no bias

if(RUN_SIMS){
  tic()
  sim_out_all = foreach(n = Ns) %dopar% {
    
    sim_out = array(dim = c(BBs, 2))
    
    for(b in 1:BBs){
      controls = sample(0:1, size = n, replace = T, prob = c(1-freq, freq))
      cases = generate_cases(effect=effect, freq=freq, N=n, bias = 0,noise_xs = 0, contam = 4)
      out = c(rep(0,n), rep(1,n))
      sim_out[b,] = holmes_test_simple(y = c(rep(0,n), cases[[2]]), 
                                       g = c(controls, cases[[1]]), 
                                       case_ind = out==1)
    }
    sim_out
  }
  toc()
  save(sim_out_all, file = 'temp.RData')
} else {
  load(file = 'temp.RData')
}
par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3)
plot(log10(Ns), lapply(sim_out_all, function(x) mean(x[,1] < 0.001 )), 
     xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
     xlab='Sample size', ylab = 'Power for alpha = 0.001')
axis(1, at = 2:4, labels = 10^(2:4))
axis(1, at = log10(seq(100, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA) 

lines(log10(Ns), lapply(sim_out_all, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)

Same simulation but adding noise to xs

if(RUN_SIMS){
  tic()
  sim_out_all2 = foreach(n = Ns) %dopar% {
    
    sim_out = array(dim = c(BBs, 2))
    
    for(b in 1:BBs){
      controls = sample(0:1, size = n, replace = T, prob = c(1-freq, freq))
      cases = generate_cases(effect=effect, freq=freq, N=n, bias = 0, noise_xs = .5, contam = 4)
      out = c(rep(0,n), rep(1,n))
      sim_out[b,] = holmes_test_simple(y = c(rep(0,n), cases[[2]]), 
                                       g = c(controls, cases[[1]]), 
                                       case_ind = out==1)
    }
    sim_out
  }
  toc()
  save(sim_out_all2, file = 'temp2.RData')
} else {
  load(file = 'temp2.RData')
}
par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3)
plot(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,1] < 0.001 )), 
     xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
     xlab='Sample size', ylab = 'Power for alpha = 0.001')
axis(1, at = 2:4, labels = 10^(2:4))
axis(1, at = log10(seq(100, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA) 

lines(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)

Same simulation as 1 but adding bias

if(RUN_SIMS){
  tic()
  sim_out_all3 = foreach(n = Ns) %dopar% {
    
    sim_out = array(dim = c(BBs, 2))
    for(b in 1:BBs){
      controls = sample(0:1, size = n, replace = T, prob = c(1-freq, freq))
      cases = generate_cases(effect=effect, freq=freq, N=n, bias = 1/3, noise_xs = 0, contam = 4)
      out = c(rep(0,n), rep(1,n))
      sim_out[b,] = holmes_test_simple(y = c(rep(0,n), cases[[2]]), 
                                       g = c(controls, cases[[1]]), 
                                       case_ind = out==1)
    }
    sim_out
  }
  toc()
  save(sim_out_all3, file = 'temp3.RData')
} else {
  load(file = 'temp3.RData')
}
par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3)
plot(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,1] < 0.001 )), 
     xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
     xlab='Sample size', ylab = 'Power for alpha = 0.001')
axis(1, at = 2:4, labels = 10^(2:4))
axis(1, at = log10(seq(100, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA) 

lines(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)

par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3, mfrow=c(2,2))

my_cases = generate_cases(effect=effect, freq=freq, N=10^6, bias = 0, 
                          noise_xs = .5, contam = 4)
plot(my_cases[[4]], 100*my_cases[[3]], type='l', xlab='Latent variable',
     ylab='Frequency of marker (%)')
abline(h = freq*100, lty=2)
mtext(text = 'A', side = 3, line = 1, adj = 0)


plot(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,1] < 0.001 )), 
     xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
     xlab='Sample size', ylab = 'Power for alpha = 0.001', xlim=c(log10(300), 4))
xvals = c(300, 1000,3000,10000)
axis(1, at = log10(xvals), labels = xvals)
axis(1, at = log10(seq(300, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA) 
abline(h = 0.5, lty=2)
lines(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)
mtext(text = 'B', side = 3, line = 1, adj = 0)


my_cases = generate_cases(effect=effect, freq=freq, N=10^6, 
                          bias = 1/3, noise_xs = 0, contam = 4)
plot(my_cases[[4]], 100*my_cases[[3]], type='l', xlab='Latent variable',
     ylab='Frequency of marker (%)')
abline(h = freq*100, lty=2)
mtext(text = 'C', side = 3, line = 1, adj = 0)

plot(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,1] < 0.001 )), 
     xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
     xlab='Sample size', ylab = 'Power for alpha = 0.001', xlim=c(log10(300), 4))
axis(1, at = log10(xvals), labels = xvals)
axis(1, at = log10(seq(300, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA) 
abline(h = 0.5, lty=2)

lines(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)
mtext(text = 'D', side = 3, line = 1, adj = 0)